Lendo os dados

resultados_avaliacoes = read_avaliacoes()
## Parsed with column specification:
## cols(
##   id = col_character(),
##   item = col_character(),
##   municipio = col_character(),
##   criterio = col_character(),
##   aproach = col_character(),
##   date = col_datetime(format = ""),
##   valid = col_logical(),
##   contNodeNumberAccess = col_double(),
##   found = col_logical(),
##   pathSought = col_character(),
##   durationMin = col_double(),
##   duration = col_double(),
##   tipo_exp = col_character()
## )
resultados_avaliacoes[is.na(resultados_avaliacoes)] <- ""

gararito = read_gabaritos()
## Parsed with column specification:
## cols(
##   municipio = col_character(),
##   criterio = col_character(),
##   item = col_character(),
##   encontrado = col_logical(),
##   local_encontrado = col_character(),
##   local_encontrado_2 = col_character()
## )
gararito[is.na(gararito)] <- ""

empresas_portais <- readr::read_csv(here::here("data/empresas_portais.csv"))
## Warning: Missing column names filled in: 'X8' [8], 'X9' [9], 'X10' [10],
## 'X11' [11], 'X12' [12], 'X13' [13], 'X14' [14], 'X15' [15], 'X16' [16],
## 'X17' [17], 'X18' [18]
## Parsed with column specification:
## cols(
##   municipio = col_character(),
##   link_portal_transp = col_character(),
##   link_prefeitura = col_character(),
##   observacoes = col_character(),
##   fornecedor = col_character(),
##   tipo_fornecer = col_character(),
##   `Fornecedor: Gestões Anteriores` = col_character(),
##   X8 = col_character(),
##   X9 = col_logical(),
##   X10 = col_logical(),
##   X11 = col_logical(),
##   X12 = col_logical(),
##   X13 = col_logical(),
##   X14 = col_logical(),
##   X15 = col_logical(),
##   X16 = col_logical(),
##   X17 = col_logical(),
##   X18 = col_character()
## )

Adicionando combinação encontrada em cada município no gabarito

empresas_portais <- empresas_portais %>% 
    select(municipio, fornecedor)

gararito<-left_join(gararito, empresas_portais, by=c("municipio"))

Juntando Avaliações e Gabaritos

# concatena os dois csv o do gabarito e avaliações do crawler
data<-full_join(resultados_avaliacoes, gararito, by=c("municipio", "item", "criterio"))

Sumarizando as avaliações

precisao <- data %>% 
    group_by(municipio, criterio, item, aproach, date) %>% 
    mutate(
           
           #verifica se a avaliação foi acertiva
           tp = valid == TRUE 
           & valid == encontrado 
           #valida se no gabarito e na avaliação o item foi encontrado na mesma url 
           & (grepl(local_encontrado, pathSought) |
                  grepl(local_encontrado_2, pathSought)),
           
           fn =  valid == FALSE 
           & encontrado == TRUE,
           
           fp = valid == TRUE 
           & encontrado == FALSE
          )

precisao %>%
    datatable(options = list(pageLength = 10),  rownames = FALSE, class = 'cell-border stripe')
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

Quantificando métricas

metricas_result <- precisao %>% 
    #filter(!is.na(aproach )) %>% 
    group_by(municipio, aproach, date) %>% 
    summarise(
        tp_total = sum(tp), 
        fn_total = sum(fn),
        fp_total = sum(fp),
        
        #cálculo das métricas 
        recall = tp_total/(tp_total + fn_total),
        precision =  tp_total/(tp_total + fp_total),
        f1_score = (2*(recall*precision))/(recall+precision),
        
        #tempo das avaliações
        median_duration_min = median(durationMin),
        median_duration = median(duration),
        max_duration = max(duration),
        max_durationMin = max(durationMin),
        median_num_access_node = median(contNodeNumberAccess),
        max_num_access_node = max(contNodeNumberAccess),
        all_access_node = sum(contNodeNumberAccess),
        combination = last(fornecedor),
        tipo_exp = last(tipo_exp)
    )

metricas_result %>% 
    write_csv(here::here("data/resultados_sumarizado.csv"))

metricas_result 
## # A tibble: 162 x 18
## # Groups:   municipio, aproach [89]
##    municipio aproach date                tp_total fn_total fp_total recall
##    <chr>     <chr>   <dttm>                 <int>    <int>    <int>  <dbl>
##  1 Alcantil  bandit  2019-11-19 04:04:19       39        2        2  0.951
##  2 Alcantil  bfs     2019-11-11 22:58:13       39        2        2  0.951
##  3 Alcantil  bfs     2019-11-28 02:39:38       39        2        2  0.951
##  4 Alcantil  dfs     2019-11-22 15:33:46       40        1        3  0.976
##  5 Arara     bandit  2019-11-01 23:51:14       36        9        4  0.8  
##  6 Arara     bandit  2019-11-11 14:33:27       36        9        3  0.8  
##  7 Arara     bfs     2019-11-02 19:43:10       36        9        4  0.8  
##  8 Arara     bfs     2019-11-06 02:44:42       36        9        4  0.8  
##  9 Arara     bfs     2019-11-09 23:00:57       36        9        3  0.8  
## 10 Arara     bfs     2019-11-28 02:39:38       39        6        3  0.867
## # … with 152 more rows, and 11 more variables: precision <dbl>, f1_score <dbl>,
## #   median_duration_min <dbl>, median_duration <dbl>, max_duration <dbl>,
## #   max_durationMin <dbl>, median_num_access_node <dbl>,
## #   max_num_access_node <dbl>, all_access_node <dbl>, combination <chr>,
## #   tipo_exp <chr>

Avaliações por abordagem

metricas_result %>%
    group_by(aproach) %>% 
    summarise(ocorrencia = n()) %>%
    ggplot(aes(y=ocorrencia, x=reorder(aproach, +(ocorrencia)))) + 
    geom_bar(stat = "identity",  fill="#5499C7") + 
    ggtitle("Número de Avaliações por Abordagem") +
    xlab("Abordagem") + 
    ylab("Número de avaliações") +
    coord_flip()

Avaliações por valor do Recall e Precision

metricas_result %>%
    ggplot(aes(x=recall)) + 
    geom_histogram(alpha=0.5, position="identity", bins=20) 

metricas_result %>%
    ggplot(aes(x=precision)) + 
    geom_histogram(alpha=0.5, position="identity", bins=20) 

metricas_result %>%
    ggplot(aes(x=recall, color=aproach)) + 
    geom_histogram(fill='white', alpha=0.5, position="identity", bins=20) +
    facet_grid(aproach ~ .)

metricas_result %>%
    ggplot(aes(x=precision, color=aproach)) + 
    geom_histogram(fill='white', alpha=0.5, position="identity", bins=20) +
    facet_grid(aproach ~ .)

Avaliações por tempo (Min)

metricas_result %>%
    ggplot(aes(x=median_duration_min, color=aproach)) + 
    geom_histogram(fill='white', alpha=0.5, position="identity", bins=20) +
    facet_grid(aproach ~ .)

metricas_result %>%
    ggplot(aes(x=max_durationMin, color=aproach)) + 
    geom_histogram(fill='white', alpha=0.5, position="identity", bins=20) +
    facet_grid(aproach ~ .)

Avaliações por número de nós acessados

metricas_result %>%
    ggplot(aes(x=median_num_access_node, color=aproach)) + 
    geom_histogram(fill='white', alpha=0.5, position="identity", bins=20) +
    facet_grid(aproach ~ .)

metricas_result %>%
    ggplot(aes(x=max_num_access_node, color=aproach)) + 
    geom_histogram(fill='white', alpha=0.5, position="identity", bins=20) +
    facet_grid(aproach ~ .)

Avaliações com Recall abaixo de 0.7

Todas as Avaliações

metricas_result %>%
    group_by(municipio) %>%
    summarise(bfs = sum(aproach == 'bfs'), dfs = sum(aproach == 'dfs'), bandit = sum(aproach == 'bandit'), tipo_exp=last(tipo_exp)) %>%
    arrange(desc(dfs)) %>%
    datatable(options = list(pageLength = 10),  rownames = FALSE, class = 'cell-border stripe')
metricas_result %>%
    select(municipio, aproach, date, recall, precision, f1_score) %>%
    arrange(desc(recall)) %>% 
    datatable(options = list(pageLength = 30),  rownames = FALSE, class = 'cell-border stripe')

Métricas

metricas_result %>%
    ggplot() + 
    geom_point(aes(x=aproach, y=recall,  color=aproach), position = "jitter")  

metricas_result %>%
  ggplot(aes(x='',y = recall)) +
  geom_boxplot(fill = "white") +
  geom_jitter(aes(color = aproach), alpha=0.5, size=3)

metricas_result %>%
    ggplot(aes(x = aproach, y = recall)) +
    geom_boxplot(fill = "orange") 

metricas_result %>%
    ggplot() + 
    geom_point(aes(x=aproach, y=f1_score,  color=aproach), position = "jitter")  

Tempo de Duração

metricas_result %>%
    ggplot() + 
    geom_point(aes(x=aproach, y=median_duration_min,  color=aproach), position = "jitter")  

metricas_result %>%
    ggplot() + 
    geom_point(aes(x=aproach, y=max_durationMin,  color=aproach), position = "jitter")  

metricas_result %>%
  ggplot(aes(x='',y = max_durationMin)) +
  geom_boxplot(fill = "white") +
  geom_jitter(aes(color = aproach), alpha=0.5, size=3)

metricas_result %>%
    ggplot(aes(x = aproach, y = max_durationMin)) +
    geom_boxplot(fill = "orange") 

Nós Acessados

metricas_result %>%
    ggplot() + 
    geom_point(aes(x=aproach, y=max_num_access_node,  color=aproach), position = "jitter")  

metricas_result %>%
  ggplot(aes(x='',y = max_num_access_node)) +
  geom_boxplot(fill = "white") +
  geom_jitter(aes(color = aproach), alpha=0.5, size=3)

metricas_result %>%
    ggplot(aes(x = aproach, y = max_num_access_node), ) +
    geom_boxplot(fill = "orange") 

IC

set.seed(123)

f1_score <- function (d, i) {
    dt<-d[i,]
    c(
        dt$f1_score
    )
}
    
bootstraps <- boot(data = metricas_result, 
                   statistic = f1_score, # <- referência para a função 
                   R = 4000) # número de bootstraps


ci.tb = tidy(bootstraps, 
          conf.level = .95,
          conf.method = "basic",
          conf.int = TRUE) 


glimpse(ci.tb)
## Observations: 162
## Variables: 5
## $ statistic <dbl> 0.9512195, 0.9512195, 0.9512195, 0.9523810, 0.8470588, 0.85…
## $ bias      <dbl> -0.06600196, -0.06585157, -0.06682167, -0.06549871, 0.03858…
## $ std.error <dbl> 0.06160066, 0.06024994, 0.06339111, 0.06184019, 0.05980092,…
## $ conf.low  <dbl> 0.9294661, 0.9294661, 0.9294661, 0.9317889, 0.7211447, 0.74…
## $ conf.high <dbl> 1.1137066, 1.1166490, 1.1167247, 1.1190476, 0.9053853, 0.92…
ci.tb %>%
    ggplot(aes(x = "", y = statistic,
               ymin = conf.low,
               ymax = conf.high)) +
    geom_pointrange() +
    geom_point(size = 3) +
    labs(y = "F1-score",
         x = "") +
    theme(axis.title = element_text(size=10)) 

#Calcula a media das posições escolhidas nas buscas.
set.seed(123)

f1_score_boot <- function (d, i) {
    dt<-d[i,]
    c(
        dt$f1_score
    )
}

boot.aproach <- metricas_result %>%
  group_by(aproach) %>% 
  mutate(cors_boot = list(
      boot(
          data = metricas_result, 
          statistic = f1_score_boot, 
          R = 4000
          )
      )
     )

ics.aproach <- boot.aproach %>% 
    group_by(aproach) %>% 
    summarise(
        ci = list(tidy(cors_boot[[1]], 
          conf.level = .95,
          conf.method = "basic",
          conf.int = TRUE))
    ) %>% 
    unnest(ci) 


ics.aproach %>%
    ggplot(aes(x = aproach, y = statistic,
               ymin = conf.low,
               ymax = conf.high)) +
    geom_pointrange() +
    geom_point(size = 3) +
    labs(y = "F1-score",
         x = "") +
    theme(axis.title = element_text(size=10)) 

metricas_result %>%
ggplot() +
  geom_boxplot(aes(x=combination, y=max_num_access_node), fill = "white")  +
  geom_point(alpha = 0.4, aes(x=combination, y=max_num_access_node, color=aproach), position = "jitter") +
  coord_flip()

metricas_result %>%
ggplot() +
  geom_boxplot(aes(x=combination, y=max_durationMin), fill = "white")  +
  geom_point(alpha = 0.4, aes(x=combination, y=max_durationMin, color=aproach), position = "jitter") +
  coord_flip()

metricas_result %>%
  filter(combination == 'Publicsoft') %>% 
  ggplot() +
  geom_boxplot(aes(x=combination, y=max_durationMin), fill = "white")  +
  geom_point(alpha = 0.4, aes(x=combination, y=max_durationMin, color=aproach), position = "jitter")